library(tidyverse)
library(ipumsr)
library(srvyr)
library(haven)
library(ipumsr)
library(tidyverse)
ddi <- read_ipums_ddi("cps_00002.xml")
data <- read_ipums_micro(ddi)
## Use of data from IPUMS CPS is subject to conditions including that users should
## cite the data appropriately. Use command `ipums_conditions()` for more details.
glimpse(data)
## Rows: 212,608
## Columns: 26
## $ YEAR <dbl> 2020, 2020, 2020, 2020, 2020, 2020, 2020, 2020, 2020, 2020, …
## $ SERIAL <dbl> 1, 2, 2, 2, 3, 5, 5, 5, 6, 7, 7, 8, 9, 9, 11, 11, 12, 12, 14…
## $ MONTH <int+lbl> 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,…
## $ HWTFINL <dbl> 1796.811, 2923.587, 2923.587, 2923.587, 1792.269, 1912.517, …
## $ CPSID <dbl> 2.02003e+13, 2.02002e+13, 2.02002e+13, 2.02002e+13, 2.02001e…
## $ PERNUM <dbl> 1, 1, 2, 3, 1, 1, 2, 3, 1, 1, 2, 1, 1, 2, 1, 2, 1, 2, 1, 1, …
## $ WTFINL <dbl> 1796.811, 2923.587, 3618.171, 4634.866, 1792.269, 1912.517, …
## $ CPSIDP <dbl> 2.02003e+13, 2.02002e+13, 2.02002e+13, 2.02002e+13, 2.02001e…
## $ RELATE <int+lbl> 101, 101, 301, 501, 101, 101, 1260, 1260, 101, 101, 202,…
## $ AGE <int+lbl> 72, 21, 1, 27, 59, 67, 33, 33, 48, 80, 80, 80, 44, 19, 7…
## $ SEX <int+lbl> 2, 2, 1, 1, 2, 1, 2, 1, 1, 1, 2, 2, 2, 2, 2, 1, 2, 1, 2,…
## $ RACE <int+lbl> 100, 200, 200, 200, 200, 100, 100, 100, 100, 100, 100, 1…
## $ MARST <int+lbl> 4, 4, 9, 6, 5, 3, 4, 6, 6, 1, 1, 6, 6, 6, 1, 1, 1, 1, 4,…
## $ POPSTAT <int+lbl> 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ EMPSTAT <int+lbl> 10, 21, 0, 21, 10, 10, 34, 34, 10, 36, 10, 36, 10, 34, 1…
## $ LABFORCE <int+lbl> 2, 2, 0, 2, 2, 2, 1, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 2, 1,…
## $ OCC <dbl> 2205, 4720, 0, 710, 3255, 2040, 0, 0, 3930, 0, 4920, 0, 9645…
## $ IND <dbl> 7870, 8680, 0, 4670, 8191, 9160, 0, 0, 4971, 0, 7071, 0, 629…
## $ UHRSWORKT <int+lbl> 40, 999, 999, 999, 40, 65, 999, 999, 997, 999, 997, 999,…
## $ UHRSWORK1 <int+lbl> 40, 999, 999, 999, 40, 65, 999, 999, 997, 999, 997, 999,…
## $ UHRSWORK2 <int+lbl> 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 9…
## $ AHRSWORKT <dbl+lbl> 40, 999, 999, 999, 40, 40, 999, 999, 45, 999, 24, 9…
## $ AHRSWORK1 <int+lbl> 40, 999, 999, 999, 40, 40, 999, 999, 45, 999, 24, 999, 4…
## $ AHRSWORK2 <int+lbl> 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 9…
## $ ABSENT <int+lbl> 0, 2, 0, 2, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 3, 1, 0, 0, 1,…
## $ WKSTAT <int+lbl> 11, 60, 99, 50, 11, 11, 99, 99, 11, 99, 41, 99, 11, 99, …
#2.
library(srvyr)
cps_svy <- as_survey_design(data, weights = WTFINL)
class(data)
## [1] "tbl_df" "tbl" "data.frame"
class(cps_svy)
## [1] "tbl_svy" "survey.design2" "survey.design"
#3. The difference is that, the as_survey_design function turn the tibble into a “tbl_svy”, object, and create two new catagories called “survey.design” & “survey.design2”
#4. Yes, there is a value of NIU: #for UHRSWORKT, 1999-onward ASEC:0 = No hours; 997 = Hours vary; 999 = Not in universe (NIU) #for AHRSWORKT, 999 = NIU (Not in universe)
NIU.UHR.AHR <- data %>%
group_by(YEAR, UHRSWORKT, AHRSWORKT) %>%
summarise(
unweighted = n()
) %>%
filter(UHRSWORKT == 999 | AHRSWORKT == 999) %>%
arrange(YEAR, desc(UHRSWORKT))
NIU.UHR.AHR
#observed that for all observations in 2020 and 2021, while UHRSWORKT == 999, then AHRSWORKT is inevitabily 999, that means to sum up the observations that are not in the universe in these two years, we could only group_by the AHRSWORKT variable. Rewrite our code:
NIU.UHR.AHR <- data %>%
group_by(YEAR, AHRSWORKT) %>%
summarise(
unweighted = n()
) %>%
filter(AHRSWORKT == 999) %>%
arrange(YEAR, desc(AHRSWORKT))
NIU.UHR.AHR
#Thus, in 2020, there were 63321 observations that are not in the universe for UHRSWORKT and AHRSWORKT; in 2021, there were 61703 of them.
#5.Filter out observations not in the universe for UHRSWORKT and create a new data frame called cps_subset_svy with the assignment operator.
cps_subset_svy <- data %>%
filter(UHRSWORKT != 999)
cps_subset_svy
#6.Count the number of unweighted responses for each value for UHRSWORKT.
cps_subset_6 <- cps_subset_svy %>%
group_by(UHRSWORKT) %>%
filter(UHRSWORKT != 997) %>%
summarize (
unweighted = n()
)
ggplot(data = cps_subset_6) +
geom_col(mapping = aes(x = UHRSWORKT, y = unweighted))
#7.Calculate the mean usual hours worked (UHRSWORKT) in 2020 and 2021. Exclude workers with “Hours vary”.
cps_subset_7 <- cps_subset_svy %>%
group_by(YEAR) %>%
filter(UHRSWORKT != 997) %>%
summarise(mean_UHRSWORKT = mean(UHRSWORKT))
cps_subset_7
#8.Calculate the proportion of workers who usually worked exactly 40 hours in 2021.
cps_subset_8 <- cps_subset_svy %>%
filter(YEAR == 2021) %>%
mutate(
HOURS_40 = if_else(condition = UHRSWORKT == 40, true = 1, false = 0)
) %>%
group_by(YEAR) %>%
summarise(proportion_40 = mean(HOURS_40))
cps_subset_8
#9.Calculate the proportion of workers who worked less, the same, and more than usual in April 2020 and April 2021 (separately) by comparing UHRSWORKT and AHRSWORKT.
cps_subset_91 <- cps_subset_svy %>%
filter(UHRSWORKT != 997) %>%
select(YEAR, UHRSWORKT, AHRSWORKT) %>%
mutate(
AprilWork = case_when(
AHRSWORKT > UHRSWORKT ~ "work more",
AHRSWORKT < UHRSWORKT ~ "work less",
TRUE ~ "work the same"
)
)
cps_subset_92 <- as_survey_design(cps_subset_91) %>%
group_by(YEAR, AprilWork) %>%
summarise(prop = survey_prop())
cps_subset_92
#2. Using cps_svy, calculate the weighted count of observations ages 16 or older for each year with survey_count(). Your results should match the April 2020 and April 2021 values to the closest thousand because of rounding. #Sorry I don’t have any idea about “survey_count”, it seems not appear when I was trying to search it. I also don’t know how to put the age >= 16 into weight.
#3. Create three numeric indicator variables.
cps_svy_33 <- cps_svy %>%
mutate(
labor_force = case_when(
LABFORCE == 2 ~ 1,
LABFORCE == 1 ~ 0,
TRUE ~ 999
)
) %>%
mutate(
employed = case_when(
EMPSTAT == 10 | EMPSTAT == 12 ~ 1,
EMPSTAT == 20 | EMPSTAT == 21 | EMPSTAT == 22 ~ 0,
TRUE ~ 999
)
) %>%
mutate(
unemployed = case_when(
EMPSTAT == 10 | EMPSTAT == 12 ~ 0,
EMPSTAT == 20 | EMPSTAT == 21 | EMPSTAT == 22 ~ 1,
TRUE ~ 999
)
)
#4.Filter to the civilian population ages 16 or older, and calculate the relavant population.
cps_svy_34 <- cps_svy_33 %>%
filter(AGE >= 16) %>%
filter(POPSTAT == 1) %>%
summarise(
LABFOR_cal = survey_total(labor_force == 1),
EMPLOY_cal = survey_total(employed == 1),
UNEMPLOY_cal = survey_total(unemployed == 1)
)
cps_svy_34
#5. Compare the result with microdata to the official tabulation for April 2021. #The official data is: civilian labor force - 160,379,000; employed - 151,160,000; unemployed - 9,220,000. #What I got here are (shown above): civilian labor force - 316,601,435; employed - 284,601,546; unemployed - 31,999,889. They are far from “close”, thus there might be some problems in my code.
#1.-3
cps2021 <- filter(data, YEAR == 2021)
cps2021_sum <- cps2021 %>%
summarise(n())
cps2021_sum
nber <- read_dta("cpsb202104.dta")
nber_sum <- nber %>%
select(hufinal, pwcmpwgt) %>%
summarise(n())
nber_sum
#3. Notice that n = 133,449. There are 133,449 observations in dataset “nber”, while there are 111,003 observations in dataset “cps2021”.
#4. As it shows, there are 111,003 observations in nber_44, which match with the amount of the observations in dataset “cps2021”.
nber_44 <- filter(nber, hufinal <= 205)
summarise(nber_44, n())
#5. Combine the columns.
nber_cps <- bind_cols(cps2021, nber_44)
nber_cps